#loading packages 
library(ezids)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(tibble)
library(dplyr)
library(tidyr)
library(psych)
library(corrplot)
library(lattice)
library(FNN)
library(gmodels)
library(caret)

#loading data
# read.csv() already returns a data.frame, so the extra data.frame() wrapper
# in the original was redundant and has been dropped (no behavior change)
NYweath <- read.csv("data/NYC_weather_1869_2022.csv")

#converting to R date format and adding columns for day, month, and year
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format = "%d")
NYweath$month <- format(NYweath$DATE, format = "%m")
NYweath$year <- format(NYweath$DATE, format = "%Y")
#converting temperature observations to numerical
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)
#Making month a factor (levels "01".."12" sort in calendar order)
NYweath$month <- as.factor(NYweath$month)
# subset data to desired variables
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW))
#creating a subset for 1900 on
NYweath_00 <- subset(NYweath_sub, year > 1899)
xkabledplyhead(NYweath_00)
Head
DATE day month year TMAX TMIN TAVG PRCP SNOW
11265 1900-01-01 01 01 1900 20 14 NA 0.03 0.5
11266 1900-01-02 02 01 1900 23 13 NA 0.00 0.0
11267 1900-01-03 03 01 1900 26 19 NA 0.00 0.0
11268 1900-01-04 04 01 1900 32 19 NA 0.00 0.0
11269 1900-01-05 05 01 1900 39 29 NA 0.00 0.0
#new data loading from Emily, New York State population from 1900 on
# read.csv() already returns a data.frame; no wrapper needed
NYSPop_1900 <- read.csv("data/NYSPop_1900-2021.csv")
NYSPop_1900$Population <- as.numeric(NYSPop_1900$Population)
NYSPop_1900$Year <- as.numeric(NYSPop_1900$Year)

#creating a subset for 1900-2021 (both bounds in one subset call)
NYweath_00a <- subset(NYweath_sub, year > 1899 & year < 2022)

# Attach the state population for each observation's year.
# Vectorized match() replaces the original element-by-element for loop
# (which used the 1:length() anti-pattern and a which() lookup per row);
# a year absent from NYSPop_1900 now yields NA instead of an error.
NYweath_00a$Pop <- NYSPop_1900$Population[match(NYweath_00a$year, NYSPop_1900$Year)]

#New data loading from Emily, shootings
# read.csv() already returns a data.frame
NYshoot <- read.csv("data/Shooting_Counts_ERG.csv")

#converting to R date format and adding columns for day, month, and year
NYshoot$DATE <- as.Date(NYshoot$Date, format = "%m/%d/%Y")

#cleaning shooting data, merging with NYweath_00a
# restrict the weather frame to 2006-2021 before joining
NYweathshoot_06 <- subset(NYweath_00a, year > 2005 & year < 2022)

# full join keeps every shooting record and every weather day in range
NYweathshoot_06 <- full_join(NYshoot, NYweathshoot_06, by = "DATE")
# rebuild day/month/year from the joined DATE column
NYweathshoot_06$day <- as.numeric(format(NYweathshoot_06$DATE, format = "%d"))
NYweathshoot_06$month <- as.factor(format(NYweathshoot_06$DATE, format = "%m"))
# NOTE: the original called as.numeric(x, format = "%Y"); as.numeric() has no
# 'format' argument and silently ignored it, so dropping it changes nothing
NYweathshoot_06$year <- as.numeric(format(NYweathshoot_06$DATE, format = "%Y"))
# days with no recorded shootings come through the join as NA; treat as zero
NYweathshoot_06 <- NYweathshoot_06 %>% mutate(Shootings = ifelse(is.na(Shootings), 0, Shootings))
summary(NYweathshoot_06)
##      Date             Shootings       Murders         DATE           
##  Length:5844        Min.   : 0.0   Min.   : 0    Min.   :2006-01-01  
##  Class :character   1st Qu.: 2.0   1st Qu.: 0    1st Qu.:2009-12-31  
##  Mode  :character   Median : 4.0   Median : 0    Median :2013-12-31  
##                     Mean   : 4.4   Mean   : 1    Mean   :2013-12-31  
##                     3rd Qu.: 6.0   3rd Qu.: 1    3rd Qu.:2017-12-31  
##                     Max.   :47.0   Max.   :12    Max.   :2021-12-31  
##                                    NA's   :435                       
##       day           month           year           TMAX            TMIN     
##  Min.   : 1.0   01     : 496   Min.   :2006   Min.   : 13.0   Min.   :-1.0  
##  1st Qu.: 8.0   03     : 496   1st Qu.:2010   1st Qu.: 48.0   1st Qu.:36.0  
##  Median :16.0   05     : 496   Median :2014   Median : 65.0   Median :49.0  
##  Mean   :15.7   07     : 496   Mean   :2014   Mean   : 63.3   Mean   :49.1  
##  3rd Qu.:23.0   08     : 496   3rd Qu.:2017   3rd Qu.: 79.0   3rd Qu.:64.0  
##  Max.   :31.0   10     : 496   Max.   :2021   Max.   :104.0   Max.   :84.0  
##                 (Other):2868                                                
##       TAVG           PRCP           SNOW            Pop          
##  Min.   : NA    Min.   :0.00   Min.   : 0.00   Min.   :19104631  
##  1st Qu.: NA    1st Qu.:0.00   1st Qu.: 0.00   1st Qu.:19376734  
##  Median : NA    Median :0.00   Median : 0.00   Median :19574362  
##  Mean   :NaN    Mean   :0.14   Mean   : 0.09   Mean   :19524848  
##  3rd Qu.: NA    3rd Qu.:0.06   3rd Qu.: 0.00   3rd Qu.:19640651  
##  Max.   : NA    Max.   :7.57   Max.   :27.30   Max.   :20154933  
##  NA's   :5844

CW ADD: Adding a 'TOT_PRCP' column that sums the total precipitation between SNOW and PRCP. This column will be used in Question 3.

# Copy the 1900-on weather frame and add TOT_PRCP, the sum of liquid
# precipitation (PRCP) and snowfall (SNOW), for use in Question 3.
NYweath_final <- within(NYweath_00, TOT_PRCP <- PRCP + SNOW)

Introduction

In this project, we are digging into the relationship between human activity and weather in New York city. Our three driving questions are:

  1. How do changes in NYC weather patterns correlate to changes in population and economic activity over the same time frame?

  2. Are there statistically measurable changes in NYC air quality over time, and are they correlated to changes in daily maximum temperature observed in previous analysis?

  3. How do changes in weather patterns correlate to other local human activity, such as crime, reported COVID cases, and stock market performance?

Local weather and global and local human environmental footprint

Emily will re-do linear regression looking at measures of local and global human activity as regressors rather than year. She might also look into variable transformations (i.e., linear models fit to polynomials of regressors) to see if the response is best fit as linear or polynomial.

At the end of our exploratory data analysis, we developed a linear model of maximum daily temperature over time, with year as a linear regressor. This revealed to us that there is a statistically significant increase in average maximum temperatures over time. However, we do not suspect that time is the cause — rather, it is something else that has changed over time that has caused the warming in New York. We wanted to explore correlations with other, more direct proxies for human activity.

Our original fit used year as a numerical regressor and month as a categorical regressor. The resulting fit has an r-squared value of 0.775 and a slope of 0.025 degrees Fahrenheit per year, with all fit parameters’ p-values well below 0.05. The different intercepts for each level of the categorical variable (the twelve months of the year) indicated that January is the coldest and July the hottest month in Central Park, with an average difference in maximum daily temperature of approximately 46 degrees Fahrenheit in any given year over this window.

# Linear model of daily maximum temperature: year as a numeric regressor,
# month as a categorical regressor (January "01" is the reference level)
maxTfit00_ym <- lm(formula = TMAX ~ year + month, data = NYweath_00a )
# keep the residuals for later diagnostics
res00_ym <- residuals(maxTfit00_ym)
summary(maxTfit00_ym)  
## 
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_00a)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.37  -5.88  -0.16   5.63  37.50 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.78315    2.34463    -4.6  4.3e-06 ***
## year          0.02527    0.00119    21.2  < 2e-16 ***
## month02       1.35817    0.20898     6.5  8.2e-11 ***
## month03      10.13406    0.20406    49.7  < 2e-16 ***
## month04      21.45601    0.20575   104.3  < 2e-16 ***
## month05      32.29032    0.20406   158.2  < 2e-16 ***
## month06      40.85601    0.20575   198.6  < 2e-16 ***
## month07      46.00370    0.20406   225.4  < 2e-16 ***
## month08      44.14146    0.20406   216.3  < 2e-16 ***
## month09      37.44016    0.20575   182.0  < 2e-16 ***
## month10      26.46801    0.20406   129.7  < 2e-16 ***
## month11      14.59563    0.20575    70.9  < 2e-16 ***
## month12       3.71576    0.20406    18.2  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.87 on 44547 degrees of freedom
## Multiple R-squared:  0.773,  Adjusted R-squared:  0.773 
## F-statistic: 1.26e+04 on 12 and 44547 DF,  p-value: <2e-16

The two extremes and their linear models are plotted in the following figure.

#plot of just July and January
# The original chained labs() with xlab()/ylab()/ggtitle(); the later calls
# silently overrode labs(), so this version keeps only the labels that
# actually rendered, in a single labs() call.
ggplot(NYweath_00a, aes(x = year, y = TMAX, color = month)) +
    geom_point() +
    # color only January and July; all other months get NA (not drawn in legend)
    scale_color_manual(values = c("01" = "purple4",
                                  "07" = "red"), na.value = NA) +
    # intercepts and common slope taken from the maxTfit00_ym fit
    # (January and July fitted lines)
    geom_abline(aes(intercept = -11.05508, slope = 0.02539), col = "black", size = 1) +
    geom_abline(aes(intercept = 34.98295, slope = 0.02539), col = "black", size = 1) +
    labs(
        x = "Year",
        y = "Maximum daily temperature",
        title = "Maximum Daily Temperature in Central Park")

Do other weather variables correlate to TMAX?

# keep only the numeric variables we want to correlate with TMAX
NYweath_cor <- subset(NYweath_00a, select = c(year, TMAX, PRCP, SNOW))
str(NYweath_cor)
## 'data.frame':    44560 obs. of  4 variables:
##  $ year: num  1900 1900 1900 1900 1900 1900 1900 1900 1900 1900 ...
##  $ TMAX: num  20 23 26 32 39 40 43 40 33 41 ...
##  $ PRCP: num  0.03 0 0 0 0 0 0 0.01 0 0 ...
##  $ SNOW: num  0.5 0 0 0 0 0 0 0 0 0 ...
# pairwise.complete.obs drops NA pairs per variable pair rather than
# deleting whole rows, so each correlation uses all available data
weathcor <- cor(NYweath_cor, use = "pairwise.complete.obs")
corrplot::corrplot(weathcor)

cor
## function (x, y = NULL, use = "everything", method = c("pearson", 
##     "kendall", "spearman")) 
## {
##     na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs", 
##         "everything", "na.or.complete"))
##     if (is.na(na.method)) 
##         stop("invalid 'use' argument")
##     method <- match.arg(method)
##     if (is.data.frame(y)) 
##         y <- as.matrix(y)
##     if (is.data.frame(x)) 
##         x <- as.matrix(x)
##     if (!is.matrix(x) && is.null(y)) 
##         stop("supply both 'x' and 'y' or a matrix-like 'x'")
##     if (!(is.numeric(x) || is.logical(x))) 
##         stop("'x' must be numeric")
##     stopifnot(is.atomic(x))
##     if (!is.null(y)) {
##         if (!(is.numeric(y) || is.logical(y))) 
##             stop("'y' must be numeric")
##         stopifnot(is.atomic(y))
##     }
##     Rank <- function(u) {
##         if (length(u) == 0L) 
##             u
##         else if (is.matrix(u)) {
##             if (nrow(u) > 1L) 
##                 apply(u, 2L, rank, na.last = "keep")
##             else row(u)
##         }
##         else rank(u, na.last = "keep")
##     }
##     if (method == "pearson") 
##         .Call(C_cor, x, y, na.method, FALSE)
##     else if (na.method %in% c(2L, 5L)) {
##         if (is.null(y)) {
##             .Call(C_cor, Rank(na.omit(x)), NULL, na.method, method == 
##                 "kendall")
##         }
##         else {
##             nas <- attr(na.omit(cbind(x, y)), "na.action")
##             dropNA <- function(x, nas) {
##                 if (length(nas)) {
##                   if (is.matrix(x)) 
##                     x[-nas, , drop = FALSE]
##                   else x[-nas]
##                 }
##                 else x
##             }
##             .Call(C_cor, Rank(dropNA(x, nas)), Rank(dropNA(y, 
##                 nas)), na.method, method == "kendall")
##         }
##     }
##     else if (na.method != 3L) {
##         x <- Rank(x)
##         if (!is.null(y)) 
##             y <- Rank(y)
##         .Call(C_cor, x, y, na.method, method == "kendall")
##     }
##     else {
##         if (is.null(y)) {
##             ncy <- ncx <- ncol(x)
##             if (ncx == 0) 
##                 stop("'x' is empty")
##             r <- matrix(0, nrow = ncx, ncol = ncy)
##             for (i in seq_len(ncx)) {
##                 for (j in seq_len(i)) {
##                   x2 <- x[, i]
##                   y2 <- x[, j]
##                   ok <- complete.cases(x2, y2)
##                   x2 <- rank(x2[ok])
##                   y2 <- rank(y2[ok])
##                   r[i, j] <- if (any(ok)) 
##                     .Call(C_cor, x2, y2, 1L, method == "kendall")
##                   else NA
##                 }
##             }
##             r <- r + t(r) - diag(diag(r))
##             rownames(r) <- colnames(x)
##             colnames(r) <- colnames(x)
##             r
##         }
##         else {
##             if (length(x) == 0L || length(y) == 0L) 
##                 stop("both 'x' and 'y' must be non-empty")
##             matrix_result <- is.matrix(x) || is.matrix(y)
##             if (!is.matrix(x)) 
##                 x <- matrix(x, ncol = 1L)
##             if (!is.matrix(y)) 
##                 y <- matrix(y, ncol = 1L)
##             ncx <- ncol(x)
##             ncy <- ncol(y)
##             r <- matrix(0, nrow = ncx, ncol = ncy)
##             for (i in seq_len(ncx)) {
##                 for (j in seq_len(ncy)) {
##                   x2 <- x[, i]
##                   y2 <- y[, j]
##                   ok <- complete.cases(x2, y2)
##                   x2 <- rank(x2[ok])
##                   y2 <- rank(y2[ok])
##                   r[i, j] <- if (any(ok)) 
##                     .Call(C_cor, x2, y2, 1L, method == "kendall")
##                   else NA
##                 }
##             }
##             rownames(r) <- colnames(x)
##             colnames(r) <- colnames(y)
##             if (matrix_result) 
##                 r
##             else drop(r)
##         }
##     }
## }
## <bytecode: 0x7fb7c5bdc9b0>
## <environment: namespace:stats>

We have found a reasonable linear model for temperature over time, but can we look instead at the connection to human activities, rather than time? Can we use some aspect of human activity as a regressor and generate a reasonable model?

We looked to the Census for U.S. population data, but that is only reported decennially, so we looked for other sources. We found historical data back to 1960 for New York state online https://www.macrotrends.net/cities/23083/new-york-city/population. Because this source is not known to us, we validated it against decennial census data.

A bunch of linear models…

#LM1
# Baseline month-only model (no year term), for comparison with maxTfit00_ym
maxTfit00_m <- lm(formula = TMAX ~ month, data = NYweath_00a)
summary(maxTfit00_m)  
## 
## Call:
## lm(formula = TMAX ~ month, data = NYweath_00a)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40.47  -5.90  -0.19   5.78  37.89 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   38.754      0.145  267.24   <2e-16 ***
## month02        1.358      0.210    6.47    1e-10 ***
## month03       10.134      0.205   49.41   <2e-16 ***
## month04       21.456      0.207  103.76   <2e-16 ***
## month05       32.290      0.205  157.45   <2e-16 ***
## month06       40.856      0.207  197.58   <2e-16 ***
## month07       46.004      0.205  224.32   <2e-16 ***
## month08       44.141      0.205  215.24   <2e-16 ***
## month09       37.440      0.207  181.06   <2e-16 ***
## month10       26.468      0.205  129.06   <2e-16 ***
## month11       14.596      0.207   70.58   <2e-16 ***
## month12        3.716      0.205   18.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.92 on 44548 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.771 
## F-statistic: 1.36e+04 on 11 and 44548 DF,  p-value: <2e-16
#LM4
# Extend the year + month model with daily precipitation as a regressor
maxTfit00_all <- lm(formula = TMAX ~ year + month + PRCP, data = NYweath_00a)
summary(maxTfit00_all)  
## 
## Call:
## lm(formula = TMAX ~ year + month + PRCP, data = NYweath_00a)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.50  -5.88  -0.19   5.60  37.35 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.12661    2.34225   -4.75  2.0e-06 ***
## year          0.02551    0.00119   21.39  < 2e-16 ***
## month02       1.36397    0.20874    6.53  6.5e-11 ***
## month03      10.15636    0.20384   49.82  < 2e-16 ***
## month04      21.47723    0.20553  104.49  < 2e-16 ***
## month05      32.30716    0.20384  158.49  < 2e-16 ***
## month06      40.87389    0.20553  198.87  < 2e-16 ***
## month07      46.03686    0.20386  225.83  < 2e-16 ***
## month08      44.17913    0.20387  216.71  < 2e-16 ***
## month09      37.46350    0.20554  182.27  < 2e-16 ***
## month10      26.48077    0.20384  129.91  < 2e-16 ***
## month11      14.60290    0.20553   71.05  < 2e-16 ***
## month12       3.72867    0.20384   18.29  < 2e-16 ***
## PRCP         -1.18037    0.11749  -10.05  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.86 on 44546 degrees of freedom
## Multiple R-squared:  0.773,  Adjusted R-squared:  0.773 
## F-statistic: 1.17e+04 on 13 and 44546 DF,  p-value: <2e-16
#maxTfit00_all_intrxn <- lm(formula = TMAX ~ year + month*day + PRCP + SNOW, data = NYweath_00a)
#summary(maxTfit00_all)  
#anova(maxTfit00_m, maxTfit00_ym)
#anova(maxTfit00_all, maxTfit00_all_intrxn)
#LM2
# Replace year with state population (Pop) as the trend regressor
maxTfit00_pop <- lm(formula = TMAX ~ Pop + month, data = NYweath_00a)
summary(maxTfit00_pop)  
## 
## Call:
## lm(formula = TMAX ~ Pop + month, data = NYweath_00a)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -39.23  -5.86  -0.17   5.65  37.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.51e+01   2.21e-01   158.7  < 2e-16 ***
## Pop         2.38e-07   1.11e-08    21.5  < 2e-16 ***
## month02     1.36e+00   2.09e-01     6.5  8.1e-11 ***
## month03     1.01e+01   2.04e-01    49.7  < 2e-16 ***
## month04     2.15e+01   2.06e-01   104.3  < 2e-16 ***
## month05     3.23e+01   2.04e-01   158.3  < 2e-16 ***
## month06     4.09e+01   2.06e-01   198.6  < 2e-16 ***
## month07     4.60e+01   2.04e-01   225.5  < 2e-16 ***
## month08     4.41e+01   2.04e-01   216.3  < 2e-16 ***
## month09     3.74e+01   2.06e-01   182.0  < 2e-16 ***
## month10     2.65e+01   2.04e-01   129.7  < 2e-16 ***
## month11     1.46e+01   2.06e-01    71.0  < 2e-16 ***
## month12     3.72e+00   2.04e-01    18.2  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.87 on 44547 degrees of freedom
## Multiple R-squared:  0.773,  Adjusted R-squared:  0.773 
## F-statistic: 1.26e+04 on 12 and 44547 DF,  p-value: <2e-16
#maxTfit00_pop_all <- lm(formula = TMAX ~ Pop + month + PRCP, data = NYweath_00a)
#summary(maxTfit00_pop)  
#plot of NYS Pop over time

# Plot NYS population over time.
# The original chained labs() with xlab()/ylab()/ggtitle(), and the two
# titles even disagreed ("of" vs "in"); since the later calls won, this
# version keeps only the labels that actually rendered.
ggplot(NYweath_00a, aes(x = year, y = Pop)) +
    geom_point() +
#    geom_abline(aes(intercept = -11.05508, slope = 0.02539), col = "black", size = 1) + 
    labs(
        x = "Year",
        y = "New York State Population",
        title = "Annual Population in New York State")

Local weather and local environmental and climate factors

Let’s pull in our air quality data. It contains measurements of daily PM2.5 and air quality index values taken from various locations around Manhattan.

PM2.5 includes particles less than or equal to 2.5 micrometers and is also called fine particle pollution. The AQI is an index for reporting daily air quality. It tells how clean or polluted the air is, and what associated health effects might be a concern, especially for ground-level ozone and particle pollution.

Load the new data and have a look at its structure:

# Load daily air-quality observations (PM2.5 concentration and AQI) for
# Manhattan monitoring sites; read.csv() already returns a data.frame,
# so the original data.frame() wrapper was redundant and is dropped.
DailyAQ_00_22 <- read.csv("data/daily-AQ-NY-00-20.csv")
# keep only the three columns of interest and give them short names
DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', 'DAILY_AQI_VALUE')]
colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
str(DailyAQ_00_22)
## 'data.frame':    7097 obs. of  3 variables:
##  $ DATE : chr  "1/1/00" "1/2/00" "1/3/00" "1/4/00" ...
##  $ PM2.5: num  39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
##  $ AQI  : int  112 115 90 60 33 74 64 82 66 40 ...
xkablesummary(DailyAQ_00_22)
Table: Statistics summary.
DATE PM2.5 AQI
Min Length:7097 Min. :-1.2 Min. : 0
Q1 Class :character 1st Qu.: 5.9 1st Qu.: 25
Median Mode :character Median : 9.0 Median : 38
Mean NA Mean :10.9 Mean : 41
Q3 NA 3rd Qu.:13.6 3rd Qu.: 54
Max NA Max. :86.0 Max. :167
xkabledplyhead(DailyAQ_00_22)
Head
DATE PM2.5 AQI
1/1/00 39.8 112
1/2/00 41.3 115
1/3/00 30.8 90
1/4/00 16.4 60
1/5/00 7.8 33

We need to convert the date from a character string to an R type. We also calculate year-over-year growth rates for both daily PM2.5 and AQI and store them in a column.

We have about 7,000 observations between the years 2000 and 2022. A few plots to help us visualize the data:

# distribution plot of PM2.5 and daily AQI
# na.rm = TRUE guards the means against any missing observations
mean_pm25 <- mean(DailyAQ_00_22$PM2.5, na.rm = TRUE)
mean_aqi <- mean(DailyAQ_00_22$AQI, na.rm = TRUE)

# TODO: combine plots into one frame
# geom_histogram() ignores 'bins' when 'binwidth' is supplied (and warns),
# so only binwidth is specified below — the rendered plots are unchanged.
ggplot(DailyAQ_00_22) +
  geom_histogram(aes(x=PM2.5), na.rm=TRUE, alpha=0.5, color="black", fill='#BD2AE2', binwidth=2) +
  geom_vline(xintercept=mean_pm25, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_pm25 + 9, y=1000, label=paste(round(mean_pm25, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily PM2.5 Measurements", x="ug/m3 LC", y="Count")

ggplot(DailyAQ_00_22) +
  geom_histogram(aes(x=AQI), na.rm=TRUE, alpha=0.5, color="black", fill='#2DD164', binwidth=5) +
  geom_vline(xintercept=mean_aqi, color="black", size=1, linetype=5, show.legend=FALSE) +
  annotate("text", x=mean_aqi + 9, y=625, label=paste(round(mean_aqi, 2)), angle=0, size=4, color="black") +
  labs(title="Distribution of Daily AQI Level", x="", y="Count")

# TODO: group these in same figure, separate plots
# Year-over-year change rate of PM2.5.
# NOTE(review): DailyAQ_00_22_Yearly_Growth is built in a chunk not shown
# here — confirm it is defined before this chunk runs.
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = pm2.5_diffRate), na.rm = T, stat = "identity", color="#290DDA", size=1) +
  geom_point(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, fill="#124CF2", shape = 23) +
  labs(title="PM2.5 particulate year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

# Year-over-year change rate of AQI (same structure as the PM2.5 plot)
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
  geom_line(aes(x = year, y = aqi_diffRate), na.rm = T, stat="identity", color="#043008", size=1) +
  geom_point(aes(x = year, y = aqi_diffRate), na.rm = TRUE, fill="#E6E930", shape = 23) +
  labs(title="AQI year-over-year rate in NYC", x="Year", y="AQI") +
  theme(
    axis.title.y = element_text(color = "#043008", size = 13),
    axis.title.y.right = element_text(color = "#E6E930", size = 13)
  )

# ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
#   labs(title="AQI and growth rate and growht/decay rate by year in NYC", x="Year", y="ug/m3 LC") +
#   scale_y_continuous(sec.axis = sec_axis(~., name="Year-over-year Diff (%)")) +
#   theme(
#     axis.title.y = element_text(color = "#2DD164", size = 13),
#     axis.title.y.right = element_text(color = "#E6E930", size = 13)
#   )

Next, we combine our new dataset with the NYC weather data based on the date. The days without a matching air quality measurement will be dropped after merge.

# Join air-quality and weather observations on DATE; merge() keeps only
# matching dates, so days without an AQ measurement are dropped.
DailyAQ_merged <- merge(DailyAQ_00_22, NYweath_00, by = "DATE")
# Keep only the columns needed downstream ('year.x' is the AQ frame's year).
keep_cols <- c('DATE', 'year.x', 'month', 'PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW')
DailyAQ_merged <- DailyAQ_merged[ , keep_cols]
# Rename by name rather than by position for robustness.
names(DailyAQ_merged)[names(DailyAQ_merged) == "year.x"] <- "year"
str(DailyAQ_merged)
## 'data.frame':    7063 obs. of  9 variables:
##  $ DATE : Date, format: "2000-01-01" "2000-01-02" ...
##  $ year : num  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ month: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ PM2.5: num  39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
##  $ AQI  : int  112 115 90 60 33 74 64 82 66 40 ...
##  $ TMAX : num  50 60 64 60 47 49 38 51 58 52 ...
##  $ TMIN : num  34 43 51 46 29 35 30 37 44 40 ...
##  $ PRCP : num  0 0 0 0.7 0 0 0 0.02 0.84 0 ...
##  $ SNOW : num  0 0 0 0 0 0 0 0 0 0 ...
xkablesummary(DailyAQ_merged)
Table: Statistics summary.
DATE year month PM2.5 AQI TMAX TMIN PRCP SNOW
Min Min. :2000-01-01 Min. :2000 03 : 634 Min. :-1.2 Min. : 0.0 Min. : 13.0 Min. :-1.0 Min. :0.00 Min. : 0.00
Q1 1st Qu.:2007-02-06 1st Qu.:2007 01 : 617 1st Qu.: 5.9 1st Qu.: 25.0 1st Qu.: 48.0 1st Qu.:36.0 1st Qu.:0.00 1st Qu.: 0.00
Median Median :2012-03-16 Median :2012 05 : 615 Median : 9.1 Median : 38.0 Median : 64.0 Median :48.0 Median :0.00 Median : 0.00
Mean Mean :2011-12-20 Mean :2011 04 : 611 Mean :10.9 Mean : 41.1 Mean : 62.6 Mean :48.4 Mean :0.13 Mean : 0.09
Q3 3rd Qu.:2017-05-27 3rd Qu.:2017 12 : 610 3rd Qu.:13.6 3rd Qu.: 54.0 3rd Qu.: 79.0 3rd Qu.:63.0 3rd Qu.:0.05 3rd Qu.: 0.00
Max Max. :2022-09-26 Max. :2022 07 : 579 Max. :86.0 Max. :167.0 Max. :103.0 Max. :83.0 Max. :7.57 Max. :27.30
NA NA NA (Other):3397 NA NA NA NA NA NA

Linear Model with daily air quality and weather variables

# subset to numerical variables (for correlation and regression)
DailyAQ_numerics <- DailyAQ_merged[ , c('PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW', 'year')]
# combine PRCP and SNOW into single value
#DailyAQ_numerics$PRCP <- DailyAQ_numerics$PRCP + DailyAQ_numerics$SNOW
#DailyAQ_numerics <- subset(DailyAQ_numerics, select = -c(SNOW))
# re-base year so 2000 = 0, making regression intercepts interpretable
DailyAQ_numerics$year <- DailyAQ_numerics$year - 2000 

Correlation analysis

Lattice pairs plot

# base scatterplot matrix of all numeric variables
pairs(DailyAQ_numerics)

# psych::pairs.panels adds histograms, correlations, and fit lines
pairs.panels(DailyAQ_numerics, 
  method = "pearson", # correlation method
  hist.col = "red", # set histogram color
  density = TRUE,  # show density plots
  ellipses = TRUE, # show correlation ellipses
  smoother = TRUE, # smoothed-density scatter
  lm = TRUE,       # draw linear fits rather than loess
  main = "Pairs Plot Of Weather and Air Quality Numerical Variables",
  cex.labels=0.75
)

Another way to look at correlation using the corrplot function:

# Pearson correlation matrix of the merged numeric variables
DailyAQ_cor <- cor(DailyAQ_numerics)
corrplot(DailyAQ_cor, method="number", title="Correlation Plot Of Weather and Air Quality Numerical Variables", mar = c(2, 2, 2, 2))

From the Pearson correlation plot above, we can see a large, positive correlation between PM2.5 concentrations and the daily AQI values. This is expected, as PM2.5 is heavily weighted in calculations of AQI. Unfortunately, the correlations between our weather variables and our air quality variables are relatively weak. However, we will still attempt a linear model between them below.

# yearly average and year-over year growth of daily AQI and PM2.5
# NOTE(review): DailyAQ_00_22_Yearly_Growth is built in a chunk not shown
# here — confirm it is defined before this chunk runs.
ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = aqi_avg), stat="identity", color="#2DD164", size=1) +
  geom_point(aes(x = year, y = aqi_avg), na.rm = TRUE, fill="#457108", shape = 21) +
  labs(title="Average AQI by year in NYC", x="Year", y="AQI value")

# same plot for the yearly PM2.5 average
ggplot(DailyAQ_00_22_Yearly_Growth) +
  geom_line(aes(x = year, y = pm2.5_avg), stat="identity", color="#BD2AE2", size=1) +
  geom_point(aes(x = year, y = pm2.5_avg), na.rm = TRUE, fill="#124CF2", shape = 21) +
  labs(title="Average PM2.5 particulate amount by year in NYC", x="Year", y="Year-over-year Diff (%)")

Linear models

Let’s start by creating a linear model to describe the relationship between AQI and year.

# Simple linear model: daily AQI as a function of re-based year (2000 = 0)
aqi_fit <- lm(AQI ~ year, data = DailyAQ_numerics)
summary(aqi_fit)
## 
## Call:
## lm(formula = AQI ~ year, data = DailyAQ_numerics)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50.34 -13.11  -1.49  11.06 126.83 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  61.4369     0.4497   136.6   <2e-16 ***
## year         -1.7748     0.0342   -51.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.4 on 7061 degrees of freedom
## Multiple R-squared:  0.276,  Adjusted R-squared:  0.276 
## F-statistic: 2.69e+03 on 1 and 7061 DF,  p-value: <2e-16
xkabledply(aqi_fit, title = paste("First Linear Model: ", format( formula(aqi_fit) )))
First Linear Model: AQI ~ year
Estimate Std. Error t value Pr(>|t|)
(Intercept) 61.44 0.4497 136.6 0
year -1.77 0.0342 -51.8 0

The coefficient for the year regressor is significant and negative: daily AQI decreases by about 1.77 units per year (year is coded as years since 2000). Although the p-value of the F-statistic is significant, year still only explains about 28% of the variability in daily AQI measurements.

# Scatter of daily AQI by year with an lm trend line; jitter reduces
# overplotting of the many points sharing the same year
ggplot(DailyAQ_00_22, aes(x = year, y = AQI)) + 
  geom_point(alpha = 0.5, color = "#2DD164", position = "jitter") +
  labs(x = "Year", y = "AQI Value", title = "Daily AQI Values From 2000-2022 With Trend Line") +
  geom_smooth(method = 'lm', formula = 'y ~ x', color = "black", fill="black")

The plot displays a slightly downward trend in daily AQI, but there is a lot of noise distorting the fit.

Adding month as a categorical regressor

In our first analysis, we analyzed linear trends of TMAX over time and determined a slight positive correlation observed over the years 1900-2022. Based on that fit, we hypothesized that seasonality trends had an impact on model performance.

We believe seasonality also affects daily AQI measurements.

# NYC weather - Avg TMAX by month
NYweath_Monthly_Avg <- NYweath_00 %>%
  group_by(month) %>%
  summarize(avg_max_temp = mean(TMAX, na.rm=T),
            avg_min_temp = mean(TMIN, na.rm=T))


# as.numeric() on the month factor yields the level index 1..12, which
# matches calendar order because the levels are "01".."12"
ggplot(NYweath_Monthly_Avg, aes(x = as.numeric(month), y = avg_max_temp)) +
  geom_line(color="#F21E1E", size = 2) +
  geom_point(na.rm = TRUE, fill="#126BF4", shape = 21, size = 4) +
  labs(title="Average TMAX By Month in NYC", x="Month", y="Temperature (°F)") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

# Monthly averages of PM2.5 and AQI across all years in the merged data
DailyAQ_monthly <- DailyAQ_merged %>%
  group_by(month) %>%
  summarize(pm2.5_avg = mean(PM2.5, na.rm=T),
            aqi_avg = mean(AQI, na.rm=T))

# calculate growth/decay rates month-over-month (lag() leaves row 1 NA)
DailyAQ_monthly <- DailyAQ_monthly %>%
  mutate(pm2.5_diffRate = ((pm2.5_avg - lag(pm2.5_avg)) / pm2.5_avg) * 100,
         aqi_diffRate = ((aqi_avg - lag(aqi_avg)) / aqi_avg) * 100
      )
# populate January rates based on December (wrap-around).
# Direct name-based assignment replaces the original fragile
# DailyAQ_monthly[1, 4]$... form, which depended on column position.
DailyAQ_monthly$pm2.5_diffRate[1] <- ((DailyAQ_monthly$pm2.5_avg[1] - DailyAQ_monthly$pm2.5_avg[12]) / DailyAQ_monthly$pm2.5_avg[1]) * 100
DailyAQ_monthly$aqi_diffRate[1] <- ((DailyAQ_monthly$aqi_avg[1] - DailyAQ_monthly$aqi_avg[12]) / DailyAQ_monthly$aqi_avg[1]) * 100

# yearly average and year-over year growth of daily AQI and PM2.5
# TODO: combine with month-over-month change plot
# Average AQI by calendar month (as.numeric(month) = level index 1..12)
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_avg)) +
  geom_line(color="#47ABE9", size = 2) +
  geom_point(na.rm = TRUE, fill="#C10808", shape = 21, size = 4) +
  labs(title="Average AQI By Month in NYC", x="Month", y="AQI") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

# Month-over-month percentage change in average AQI
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_diffRate)) +
  geom_line(na.rm = TRUE, stat="identity", color="#043008", size=2) +
  geom_point(na.rm = TRUE, fill="#E6E930", shape = 21) +
  labs(title="Average AQI month-over-month change rate", x="Month", y="AQI") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))

Let’s modify our last model to attempt to fit seasonality by adding month as a categorical regressor and our variable-of-interest from the last project - TMAX - to predict daily AQI.

# Second model: daily AQI predicted from TMAX with month as a categorical
# regressor (January is the reference level)
aqi_fit2 <- lm(AQI ~ TMAX + month, data = DailyAQ_merged)
summary(aqi_fit2)
## 
## Call:
## lm(formula = AQI ~ TMAX + month, data = DailyAQ_merged)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43.72 -14.66  -3.03  11.52 124.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.9722     1.3444   18.58  < 2e-16 ***
## TMAX          0.6781     0.0271   25.03  < 2e-16 ***
## month02      -5.9768     1.1604   -5.15  2.7e-07 ***
## month03     -19.3975     1.1672  -16.62  < 2e-16 ***
## month04     -33.0288     1.2871  -25.66  < 2e-16 ***
## month05     -37.9619     1.4304  -26.54  < 2e-16 ***
## month06     -38.8086     1.5925  -24.37  < 2e-16 ***
## month07     -37.7940     1.6857  -22.42  < 2e-16 ***
## month08     -40.4565     1.6563  -24.43  < 2e-16 ***
## month09     -43.7962     1.5422  -28.40  < 2e-16 ***
## month10     -34.8408     1.3458  -25.89  < 2e-16 ***
## month11     -19.9973     1.2209  -16.38  < 2e-16 ***
## month12      -7.9325     1.1470   -6.92  5.1e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20 on 7050 degrees of freedom
## Multiple R-squared:  0.149,  Adjusted R-squared:  0.147 
## F-statistic:  103 on 12 and 7050 DF,  p-value: <2e-16
xkabledply(aqi_fit2, title = paste("Second Linear Model: ", format( formula(aqi_fit2) )))
Second Linear Model: AQI ~ TMAX + month
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.972 1.3444 18.58 0
TMAX 0.678 0.0271 25.03 0
month02 -5.977 1.1604 -5.15 0
month03 -19.398 1.1672 -16.62 0
month04 -33.029 1.2871 -25.66 0
month05 -37.962 1.4304 -26.54 0
month06 -38.809 1.5925 -24.37 0
month07 -37.794 1.6857 -22.42 0
month08 -40.456 1.6563 -24.43 0
month09 -43.796 1.5422 -28.40 0
month10 -34.841 1.3458 -25.89 0
month11 -19.997 1.2209 -16.38 0
month12 -7.933 1.1470 -6.92 0

The regression coefficient for TMAX is significant and positively correlated, with each degree increase resulting in AQI increasing by 0.68 units. All months, when compared to January, have a negative impact on AQI, with September having the largest difference. The p-value of the model’s F-statistic is also significant, allowing us to reject the null hypothesis and conclude that there’s a significant relationship between our chosen predictors and the daily AQI value. However, the \(R^2\) for our model is only .149, which indicates that only about 14.9% of the variation in daily AQI (14.7% after adjusting for the number of predictors) can be explained by TMAX and month.

Seasonality can cause a poor linear model. Properly testing it would require developing a seasonality time-series model to properly fit the data.

Check for multicollinearity

# model VIF scores
# Variance inflation factors for aqi_fit2; all values reported below are under
# the common cutoff of 5, so multicollinearity is acceptable.
xkablevif(aqi_fit2, title = "Model 2 VIF")
Model 2 VIF
month02 month03 month04 month05 month06 month07 month08 month09 month10 month11 month12 TMAX
1.78 1.98 2.32 2.88 3.36 3.79 3.59 3.01 2.38 1.96 1.84 4.25

The VIF values of all regressors are acceptable.

k-NN model to predict month based on weather and air quality data

A k-NN model can help us further analyze the seasonality effect by attempting to predict the month based on AQI and TMAX variables.

Let’s start with plotting variables and discerning month:

# Scatter TMAX against daily AQI, coloring points by month to make the
# seasonal structure visible before fitting the kNN classifier.
ggplot(DailyAQ_merged, aes(x = AQI, y = TMAX, color = month)) +
  geom_point(alpha = 0.7) +
  scale_color_brewer(palette = "Paired") +
  labs(title = "Daily Maximum Temperature vs Daily Air Quality Index Value Distinguished By Month",
       x = "Daily AQI Value",
       y = "Daily Maximum Temperature (F)")

Center and scale our data

# Standardize the six numeric columns (PM2.5, AQI, TMAX, TMIN, PRCP, SNOW --
# columns 4:9) to z-scores so kNN distances are comparable across variables.
aq_numeric_cols <- DailyAQ_merged[4:9]
DailyAQ_scaled <- as.data.frame(scale(aq_numeric_cols, center = TRUE, scale = TRUE))
str(DailyAQ_scaled)
## 'data.frame':    7063 obs. of  6 variables:
##  $ PM2.5: num  3.839 4.039 2.642 0.727 -0.417 ...
##  $ AQI  : num  3.282 3.421 2.264 0.876 -0.373 ...
##  $ TMAX : num  -0.6996 -0.1462 0.0752 -0.1462 -0.8656 ...
##  $ TMIN : num  -0.872 -0.328 0.155 -0.147 -1.173 ...
##  $ PRCP : num  -0.356 -0.356 -0.356 1.502 -0.356 ...
##  $ SNOW : num  -0.113 -0.113 -0.113 -0.113 -0.113 ...

Create train and test data sets with 4:1 splits, as well as label sets.

# Fix the RNG seed so the 80/20 split is reproducible across knits.
set.seed(1000)
# Assign each row to group 1 (train, 80%) or group 2 (test, 20%).
DailyAQ_sample <- sample(2, nrow(DailyAQ_scaled), replace=TRUE, prob=c(0.80, 0.20))

DailyAQ_training <- DailyAQ_scaled[DailyAQ_sample == 1, ]
DailyAQ_test <- DailyAQ_scaled[DailyAQ_sample == 2, ]

# Class labels come from column 3 of the unscaled frame -- NOTE(review):
# hard-coded column position; presumably this is `month` (the confusion-matrix
# classes below are 01-12) -- confirm against DailyAQ_merged's column order.
DailyAQ_trainLabels <- DailyAQ_merged[DailyAQ_sample == 1, 3]
DailyAQ_testLabels <- DailyAQ_merged[DailyAQ_sample == 2, 3]

nrow(DailyAQ_training)
## [1] 5664
nrow(DailyAQ_test)
## [1] 1399

Build kNN model

# set kval
kval <- 3

# build model
# knn() labels each test row by majority vote of its kval nearest training
# rows (knn comes from a package loaded in the setup chunk).
DailyAQ_pred <- knn(train = DailyAQ_training,
                    test = DailyAQ_test,
                    cl = DailyAQ_trainLabels,
                    k = kval)

# confusion matrix
# caret's confusionMatrix also carries overall accuracy and per-class stats.
DailyAQ_confusionMatrix <- caret::confusionMatrix(DailyAQ_pred, reference = DailyAQ_testLabels)
DailyAQ_pred_accuracy <- DailyAQ_confusionMatrix$overall['Accuracy']


xkabledply(as.matrix(DailyAQ_confusionMatrix), title = paste("ConfusionMatrix for k = ", kval))
ConfusionMatrix for k = 3
01 02 03 04 05 06 07 08 09 10 11 12
01 60 38 16 7 1 0 0 0 0 5 13 50
02 24 33 38 10 0 0 0 0 0 4 18 18
03 14 13 37 26 7 1 0 0 2 11 27 11
04 2 3 20 52 22 6 2 2 3 36 25 6
05 0 1 2 20 38 19 10 8 28 27 7 0
06 0 0 0 1 8 39 34 25 25 7 1 0
07 0 0 0 0 4 15 41 34 13 3 0 0
08 0 0 0 2 2 15 23 24 13 3 0 0
09 0 0 1 1 9 14 9 10 17 4 1 0
10 0 3 2 6 8 5 1 0 3 24 8 1
11 2 1 9 6 4 1 0 0 0 6 15 6
12 16 13 9 3 0 0 0 0 0 2 5 19
xkabledply(data.frame(DailyAQ_confusionMatrix$byClass), title=paste("k = ", kval))
k = 3
Sensitivity Specificity Pos.Pred.Value Neg.Pred.Value Precision Recall F1 Prevalence Detection.Rate Detection.Prevalence Balanced.Accuracy
Class: 01 0.508 0.898 0.316 0.952 0.316 0.508 0.390 0.0843 0.0429 0.1358 0.704
Class: 02 0.314 0.913 0.228 0.943 0.228 0.314 0.264 0.0751 0.0236 0.1036 0.614
Class: 03 0.276 0.911 0.248 0.922 0.248 0.276 0.262 0.0958 0.0264 0.1065 0.594
Class: 04 0.388 0.900 0.290 0.933 0.290 0.388 0.332 0.0958 0.0372 0.1279 0.644
Class: 05 0.369 0.906 0.237 0.948 0.237 0.369 0.289 0.0736 0.0272 0.1144 0.637
Class: 06 0.339 0.921 0.279 0.940 0.279 0.339 0.306 0.0822 0.0279 0.1001 0.630
Class: 07 0.342 0.946 0.373 0.939 0.373 0.342 0.356 0.0858 0.0293 0.0786 0.644
Class: 08 0.233 0.955 0.293 0.940 0.293 0.233 0.260 0.0736 0.0172 0.0586 0.594
Class: 09 0.164 0.962 0.258 0.935 0.258 0.164 0.200 0.0743 0.0122 0.0472 0.563
Class: 10 0.182 0.971 0.393 0.919 0.393 0.182 0.249 0.0944 0.0172 0.0436 0.576
Class: 11 0.125 0.973 0.300 0.922 0.300 0.125 0.176 0.0858 0.0107 0.0357 0.549
Class: 12 0.171 0.963 0.284 0.931 0.284 0.171 0.213 0.0793 0.0136 0.0479 0.567

How does k affect classification accuracy?

# Fit a kNN classifier with k neighbors and return its validation accuracy.
#
# Args:
#   k           number of neighbors to consider
#   train_set   scaled training predictors
#   val_set     scaled validation predictors
#   train_class true classes for the training rows
#   val_class   true classes for the validation rows
# Returns: a 1x2 matrix with columns `k` and `accuracy`.
evaluateK <- function(k, train_set, val_set, train_class, val_class) {
  # Same seed as the main model run so every k is evaluated on identical state.
  set.seed(1000)
  predicted <- knn(train = train_set,
                   test = val_set,
                   cl = train_class,
                   k = k)

  # Accuracy = diagonal of the prediction-vs-truth table over its grand total.
  confusion <- table(predicted, val_class)
  hits <- sum(confusion[row(confusion) == col(confusion)])
  cbind(k = k, accuracy = hits / sum(confusion))
}

# call evaluateK function for each odd k-value between 1 to 21
knn_different_k = sapply(seq(1, 21, by = 2),
                         function(x) evaluateK(x, 
                                             train_set = DailyAQ_training,
                                             val_set = DailyAQ_test,
                                             train_class = DailyAQ_trainLabels,
                                             val_class = DailyAQ_testLabels))

# Reformat the results
knn_different_k = data.frame(k = knn_different_k[1,],
                             accuracy = knn_different_k[2,])
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) + 
  labs(title = "kNN Model Accuracy vs k-value",
       x = "Model k-value",
       y = "Accuracy")

It seems 13-nearest neighbors is a decent choice because that’s the greatest improvement in predictive accuracy before the incremental improvement trails off. With an accuracy of 0.32, our model predicting month based on TMAX and AQI is not a strong fit.

Local weather and local human social and economic activity

# correlation between shooting & weather data from Emily
# Restrict to numeric columns so cor() can operate on the whole frame.
NYweathshoot_06_cor <- subset(NYweathshoot_06, select = c(year, day, TMAX, PRCP, SNOW, Shootings, Murders))
str(NYweathshoot_06_cor)
## 'data.frame':    5844 obs. of  7 variables:
##  $ year     : num  2006 2006 2006 2006 2006 ...
##  $ day      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ TMAX     : num  42 48 40 38 50 43 35 46 60 49 ...
##  $ PRCP     : num  0 0.63 1.13 0 0.05 0 0 0 0 0 ...
##  $ SNOW     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Shootings: num  8 4 4 4 4 4 2 4 9 5 ...
##  $ Murders  : int  4 1 1 0 0 0 1 1 5 0 ...
# "pairwise.complete.obs" drops NA pairs per variable pair rather than listwise.
shootcor <- cor(NYweathshoot_06_cor, use = "pairwise.complete.obs")
corrplot::corrplot(shootcor)

cor
## function (x, y = NULL, use = "everything", method = c("pearson", 
##     "kendall", "spearman")) 
## {
##     na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs", 
##         "everything", "na.or.complete"))
##     if (is.na(na.method)) 
##         stop("invalid 'use' argument")
##     method <- match.arg(method)
##     if (is.data.frame(y)) 
##         y <- as.matrix(y)
##     if (is.data.frame(x)) 
##         x <- as.matrix(x)
##     if (!is.matrix(x) && is.null(y)) 
##         stop("supply both 'x' and 'y' or a matrix-like 'x'")
##     if (!(is.numeric(x) || is.logical(x))) 
##         stop("'x' must be numeric")
##     stopifnot(is.atomic(x))
##     if (!is.null(y)) {
##         if (!(is.numeric(y) || is.logical(y))) 
##             stop("'y' must be numeric")
##         stopifnot(is.atomic(y))
##     }
##     Rank <- function(u) {
##         if (length(u) == 0L) 
##             u
##         else if (is.matrix(u)) {
##             if (nrow(u) > 1L) 
##                 apply(u, 2L, rank, na.last = "keep")
##             else row(u)
##         }
##         else rank(u, na.last = "keep")
##     }
##     if (method == "pearson") 
##         .Call(C_cor, x, y, na.method, FALSE)
##     else if (na.method %in% c(2L, 5L)) {
##         if (is.null(y)) {
##             .Call(C_cor, Rank(na.omit(x)), NULL, na.method, method == 
##                 "kendall")
##         }
##         else {
##             nas <- attr(na.omit(cbind(x, y)), "na.action")
##             dropNA <- function(x, nas) {
##                 if (length(nas)) {
##                   if (is.matrix(x)) 
##                     x[-nas, , drop = FALSE]
##                   else x[-nas]
##                 }
##                 else x
##             }
##             .Call(C_cor, Rank(dropNA(x, nas)), Rank(dropNA(y, 
##                 nas)), na.method, method == "kendall")
##         }
##     }
##     else if (na.method != 3L) {
##         x <- Rank(x)
##         if (!is.null(y)) 
##             y <- Rank(y)
##         .Call(C_cor, x, y, na.method, method == "kendall")
##     }
##     else {
##         if (is.null(y)) {
##             ncy <- ncx <- ncol(x)
##             if (ncx == 0) 
##                 stop("'x' is empty")
##             r <- matrix(0, nrow = ncx, ncol = ncy)
##             for (i in seq_len(ncx)) {
##                 for (j in seq_len(i)) {
##                   x2 <- x[, i]
##                   y2 <- x[, j]
##                   ok <- complete.cases(x2, y2)
##                   x2 <- rank(x2[ok])
##                   y2 <- rank(y2[ok])
##                   r[i, j] <- if (any(ok)) 
##                     .Call(C_cor, x2, y2, 1L, method == "kendall")
##                   else NA
##                 }
##             }
##             r <- r + t(r) - diag(diag(r))
##             rownames(r) <- colnames(x)
##             colnames(r) <- colnames(x)
##             r
##         }
##         else {
##             if (length(x) == 0L || length(y) == 0L) 
##                 stop("both 'x' and 'y' must be non-empty")
##             matrix_result <- is.matrix(x) || is.matrix(y)
##             if (!is.matrix(x)) 
##                 x <- matrix(x, ncol = 1L)
##             if (!is.matrix(y)) 
##                 y <- matrix(y, ncol = 1L)
##             ncx <- ncol(x)
##             ncy <- ncol(y)
##             r <- matrix(0, nrow = ncx, ncol = ncy)
##             for (i in seq_len(ncx)) {
##                 for (j in seq_len(ncy)) {
##                   x2 <- x[, i]
##                   y2 <- y[, j]
##                   ok <- complete.cases(x2, y2)
##                   x2 <- rank(x2[ok])
##                   y2 <- rank(y2[ok])
##                   r[i, j] <- if (any(ok)) 
##                     .Call(C_cor, x2, y2, 1L, method == "kendall")
##                   else NA
##                 }
##             }
##             rownames(r) <- colnames(x)
##             colnames(r) <- colnames(y)
##             if (matrix_result) 
##                 r
##             else drop(r)
##         }
##     }
## }
## <bytecode: 0x7fb7c5bdc9b0>
## <environment: namespace:stats>
#plot from Emily
# Histogram of daily shooting counts. NOTE: `binwidth` takes precedence over
# `bins` in geom_histogram(), so the redundant `bins = 100` argument was
# dropped; the rendered histogram (2-shootings-wide bins) is unchanged.
ggplot(NYweathshoot_06) +
  geom_histogram(aes(x = Shootings), na.rm = TRUE, alpha = 0.5, color = "black",
                 binwidth = 2, fill = "green") +
  labs(title="Distribution of daily NYC shootings (2006-2021)", x="Number of daily Shootings", y="Count")

#Not quite normal-- necessarily right-skewed, but a lot of data points...
#ERG
# Average daily shootings and shooting murders per calendar month.
shoot_Month <- NYweathshoot_06 %>%
  group_by(month) %>%
  summarize("Shootings" = mean(Shootings, na.rm = TRUE),
            "Shooting murders" = mean(Murders, na.rm = TRUE))

# side-by-side bar plot
# pivot_longer() replaces the superseded gather(); the reshaped columns and
# the rendered dodge plot are unchanged.
shoot_Month %>%
  dplyr::select(month, "Shootings", "Shooting murders") %>%
  pivot_longer(cols = c("Shootings", "Shooting murders"),
               names_to = "Value", values_to = "Count") %>%
  ggplot(aes(x=month, y=Count, fill=Value)) +
  geom_col(na.rm=TRUE, alpha=0.5, color="black", position="dodge") +
  labs(title="Average NYC daily shootings (2006-2021) by month", x="Month", y="Number of shootings") +
  scale_fill_manual(values=c("red", "blue"))

#making scatter plot of Shootings v TMAX-- ERG Add
# The original chained labs() + xlab() + ylab() + ggtitle(); the later calls
# override the earlier labs() values, so only the final labels survive. They
# are consolidated into a single labs() here -- the rendered plot is unchanged.
# (Commented-out color-scale and abline experiments removed.)
ggplot(NYweathshoot_06, aes(x = TMAX, y = Shootings, color = month)) +
    geom_point() +
    labs(x = "TMAX (degrees Fahrenheit)",
         y = "# Shootings",
         title = "Number of NYC Shootings v. TMAX")

#modeling Shootings

#shoot_lm <- lm(formula = Shootings ~ year + month + day + SNOW + PRCP + TMAX + TMIN, data = NYweathshoot_06) 
# Ensure month is categorical for the (commented) month-containing models.
NYweathshoot_06$month <- as.factor(NYweathshoot_06$month)
#shoot_lm <- lm(formula = Shootings ~ year + month + SNOW + PRCP + TMAX, data = NYweathshoot_06) 
# Final model: daily shootings on year, snowfall, precipitation, and max temp.
shoot_lm <- lm(formula = Shootings ~ year + SNOW + PRCP + TMAX, data = NYweathshoot_06) 
summary(shoot_lm) 
## 
## Call:
## lm(formula = Shootings ~ year + SNOW + PRCP + TMAX, data = NYweathshoot_06)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6.62  -2.30  -0.66   1.51  41.45 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 247.78267   19.52641   12.69  < 2e-16 ***
## year         -0.12293    0.00970  -12.68  < 2e-16 ***
## SNOW          0.13802    0.05621    2.46    0.014 *  
## PRCP         -0.55964    0.11194   -5.00  5.9e-07 ***
## TMAX          0.06609    0.00252   26.22  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.42 on 5839 degrees of freedom
## Multiple R-squared:  0.129,  Adjusted R-squared:  0.128 
## F-statistic:  216 on 4 and 5839 DF,  p-value: <2e-16

Chris will look at the local weather data and how it affects human behavior. Will look for correlations between precipitation, temperature, stock market, COVID tests, and crime

First step is to Transform Precip to Yes/No Factor Var. Will use PRCP_TOT to account for all PRCP.

# Add a column to convert PRCP to a binary factor variable. Don't care how much it rains, only if it rains. 
NYweath_prcpFact <- NYweath_final

# cut() over (-Inf, 0] vs (0, Inf) yields a two-level factor: "0" = no
# precipitation, "1" = any precipitation. cut() already returns a factor,
# so the original follow-up as.factor() call (a no-op) was removed.
NYweath_prcpFact$PRCP_factor <- cut(NYweath_final$TOT_PRCP, c(-Inf, 0, Inf), labels = c(0, 1))

Crime data

Initial import of the data. Due to the size of the data, I imported it once, aggregated the data into a new data frame that only includes date and arrest count. This was exported and saved in the Git. The code below imports directly from that aggregated dataset.

#NYcrime <- data.frame(read.csv("/Users/christopherwasher/Documents/DATS6101/NYPD_Arrests_Data__Historic_.csv"))



#NYcrime_agg <- NYcrime %>% count(ARREST_DATE)

# Load the pre-aggregated arrest counts (one row per arrest date).
NYcrime_count <- tibble(read.csv("./data/NYPD_Arrests_counts.csv"))

NYcrime_count$ARREST_DATE <- as.Date(NYcrime_count$ARREST_DATE, format = "%Y-%m-%d")
#NYcrime_count$day <- format(NYcrime_count$ARREST_DATE, format="%d")
#NYcrime_count$month <- format(NYcrime_count$ARREST_DATE, format="%m")
#NYcrime_count$year <- format(NYcrime_count$ARREST_DATE, format="%Y")

# NOTE(review): column 2 is already named ARREST_DATE (see head() output
# below), so this rename appears to be a no-op -- confirm the intended column.
colnames(NYcrime_count)[2] <- "ARREST_DATE"

head(NYcrime_count)
## # A tibble: 6 × 6
##       X ARREST_DATE NUM_ARREST   day month  year
##   <int> <date>           <int> <int> <int> <int>
## 1     1 2006-01-01         551     1     1  2006
## 2     2 2007-01-01         589     1     1  2007
## 3     3 2008-01-01         769     1     1  2008
## 4     4 2009-01-01         764     1     1  2009
## 5     5 2010-01-01         958     1     1  2010
## 6     6 2011-01-01         900     1     1  2011

Now will do summary statistics and basic EDA on the Crime Count data

# Base-graphics EDA: arrests over time, then their distribution.
# NOTE(review): plot() is called for its side effect and returns NULL, so
# crime_plot is NULL; boxplot() does return its summary statistics.
crime_plot <- plot(NYcrime_count$ARREST_DATE, NYcrime_count$NUM_ARREST)

crime_boxplot <- boxplot(NYcrime_count$NUM_ARREST)

Add the Crime data to the NY Weather Data, subsetting the weather data to 2006 through 2021 to match the years covered by the arrest records

# Weather restricted to the years covered by the arrest data (2006-2021);
# arrest counts sorted by date so rows line up with the date-ordered weather.
crimeWeather <- subset(NYweath_prcpFact, year >= 2006 & year < 2022)
NYcrime_count <- NYcrime_count[order(NYcrime_count$ARREST_DATE),]

tail(crimeWeather)
##             DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 55819 2021-12-26  26    12 2021   51   39   NA 0.00    0     0.00           0
## 55820 2021-12-27  27    12 2021   39   34   NA 0.09    0     0.09           1
## 55821 2021-12-28  28    12 2021   47   36   NA 0.05    0     0.05           1
## 55822 2021-12-29  29    12 2021   44   41   NA 0.14    0     0.14           1
## 55823 2021-12-30  30    12 2021   49   43   NA 0.05    0     0.05           1
## 55824 2021-12-31  31    12 2021   55   48   NA 0.00    0     0.00           0
# NOTE(review): cbind() pairs rows purely by position -- this assumes the
# sorted arrest counts align one-to-one with crimeWeather's dates. A merge()
# keyed on date would be safer; verify both frames cover identical dates.
NYweath_crime <- cbind(crimeWeather, NYcrime_count$NUM_ARREST)
colnames(NYweath_crime)[12] <- c("NUM_ARREST")

#NYweath_crime_plot <- plot(sqrt(NYweath_crime$PRCP), NYweath_crime$NUM_ARREST)
#boxplot((NYweath_crime$TOT_PRCP))

# Daily arrests vs max temperature, colored by rain (1) / no-rain (0).
NY_weathcrime_ggplot <- ggplot(NYweath_crime,
                               aes(x = TMAX, y =NUM_ARREST)) + 
  geom_point(aes(colour = PRCP_factor), alpha = 0.5) +
  labs(x = "Temperature (ºF)", y = "Number of Daily Arrests", 
       title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot

# Same relationship on a 25% random sample (less overplotting), with month as
# color and rain as point shape. NOTE(review): sample_frac() draws from the
# current RNG state, so this plot depends on earlier set.seed() calls.
NY_weathcrime_ggplot2 <- NYweath_crime %>% 
  sample_frac(0.25) %>%
  ggplot(aes(x = TMAX, y =NUM_ARREST)) + 
  geom_point(aes(shape = PRCP_factor, colour = month)) +
  labs(x = "Temperature (ºF)", y = "Number of Daily Arrests", 
       title = "Weather Patterns for NYC Crime")
NY_weathcrime_ggplot2

Initially made a boxplot of precipitation to observe the distribution. It is extremely skewed. However, because I’m only interested in determining whether precipitation has an effect, I will build a linear model using PRCP as a factor.

# Arrests modeled on weather alone, then weather plus month (seasonality
# control); only the month-containing model is summarized below.
crimeWeath_lm <- lm(NUM_ARREST ~ TMAX + PRCP_factor + year,  
                    data = NYweath_crime)
crimeWeathMonth_lm <- lm(NUM_ARREST ~ (TMAX + PRCP_factor + year + month), 
                    data = NYweath_crime)
#crimeWeathTMIN_lm <- lm(NUM_ARREST ~ (TMIN + PRCP_factor), 
#                    data = NYweath_crime)

summary(crimeWeathMonth_lm)
## 
## Call:
## lm(formula = NUM_ARREST ~ (TMAX + PRCP_factor + year + month), 
##     data = NYweath_crime)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1029.2  -190.2     0.3   202.7   675.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  97442.248   1530.816   63.65  < 2e-16 ***
## TMAX             2.338      0.404    5.79  7.6e-09 ***
## PRCP_factor1   -46.130      7.368   -6.26  4.1e-10 ***
## year           -47.963      0.760  -63.07  < 2e-16 ***
## month02         15.163     17.434    0.87   0.3845    
## month03          3.697     17.531    0.21   0.8330    
## month04        -52.883     19.351   -2.73   0.0063 ** 
## month05        -69.008     21.337   -3.23   0.0012 ** 
## month06       -125.430     23.518   -5.33  1.0e-07 ***
## month07       -160.821     25.009   -6.43  1.4e-10 ***
## month08       -131.473     24.384   -5.39  7.3e-08 ***
## month09       -144.147     22.672   -6.36  2.2e-10 ***
## month10        -82.766     19.830   -4.17  3.0e-05 ***
## month11       -135.518     18.046   -7.51  6.8e-14 ***
## month12       -205.302     17.138  -11.98  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 268 on 5829 degrees of freedom
## Multiple R-squared:  0.427,  Adjusted R-squared:  0.426 
## F-statistic:  310 on 14 and 5829 DF,  p-value: <2e-16

The linear model regresses arrest numbers on temperature, precipitation, year, and month. The coefficients are largely significant, and the R^2 is 0.427, indicating a statistically significant relationship in which these variables together explain roughly 43% of the variability in daily arrests. Increasing TMAX is correlated with an increase in arrests, and the presence of precipitation is associated with a decreased number of arrests.

Stock Market Data

Import the stock market data and convert the date column to a ‘Date’ data type. Also pulled out the ‘day’, ‘month’, and ‘year’ columns to help in analysis.

One last note: we will need to fill in the missing dates and populate the other columns with ‘NA’s. This will enable us to combine the stocks data with the weather data.

# Dow Jones daily history; every column arrives as character (see tail()
# output below) and is cleaned/converted to numeric further down.
NYstock <- tibble(read.csv("./data/Dow Jones Industrial Average Historical Data.csv"))

tail(NYstock)
## # A tibble: 6 × 7
##   Date     Price     Open      High      Low       Vol.    Change..
##   <chr>    <chr>     <chr>     <chr>     <chr>     <chr>   <chr>   
## 1 11/11/22 33,749.18 33,797.75 33,815.82 33,394.49 418.43M 0.11%   
## 2 11/14/22 33,537.16 33,645.97 33,963.62 33,534.27 339.05M -0.63%  
## 3 11/15/22 33,594.17 33,804.97 33,984.90 33,321.26 380.64M 0.17%   
## 4 11/16/22 33,556.60 33,554.93 33,682.39 33,521.23 288.59M -0.11%  
## 5 11/17/22 33,547.37 33,277.00 33,615.82 33,245.78 301.53M -0.03%  
## 6 11/18/22 33,747.14 33,606.59 33,827.83 33,541.07 296.37M 0.60%
# NOTE(review): "%y" is a two-digit year; as.Date/strptime maps 69-99 to 19xx
# and 00-68 to 20xx, which is correct for this 1979-2022 range -- revisit if
# the data window ever changes.
NYstock$Date <- as.Date(NYstock$Date, format = "%m/%d/%y")

NYstock2 <- NYstock
# Insert rows for missing calendar days (weekends/holidays) so the stock
# series can be aligned day-by-day with the weather data; new rows are NA.
NYstock2 <- NYstock2 %>% 
  complete(Date = seq.Date(min(Date), max(Date), by="day"))

# Widen print precision for the summaries below.
# NOTE(review): "scientific" is not a documented base R option -- this may
# have been intended as options(scipen = ...); confirm intent.
options(scientific=T, digits = 10)

# This is all just test code for figuring out how to clean the data. 
# Not part of final script.
#NYstocktest <- NYstock2
#NYstocktest$Vol. = substr(NYstocktest$Vol.,1,nchar(NYstocktest$Vol.)-1)
#tail(NYstocktest)


#NYstocktest$Price <- gsub(",", "", NYstocktest$Price)
#NYstocktest[3:5] <- lapply(NYstocktest[3:5], gsub, pattern = ",", replacement = "") 
#NYstocktest$Change.. <- gsub("%", "", NYstocktest$Change..)
#NYstocktest[2:7] <- sapply(NYstocktest[2:7], as.numeric)
###

# Strip the trailing "M" suffix from Vol. (values stay in the same unit,
# presumably millions of shares -- matches the "(M)" axis labels later).
NYstock2$Vol. = substr(NYstock2$Vol., 1, nchar(NYstock2$Vol.) - 1)
# Remove thousands separators from Price/Open/High/Low (cols 2:5), strip "%"
# from Change.., then convert all six value columns to numeric.
NYstock2[2:5] <- lapply(NYstock2[2:5], gsub, pattern = ",", replacement = "") 
NYstock2$Change.. <- gsub("%", "", NYstock2$Change..)
NYstock2[2:7] <- sapply(NYstock2[2:7], as.numeric)

# Derive day/month/year strings to mirror the weather frame's columns.
NYstock2$day <- format(NYstock2$Date, format="%d")
NYstock2$month <- format(NYstock2$Date, format="%m")
NYstock2$year <- format(NYstock2$Date, format="%Y")



head(NYstock2)
## # A tibble: 6 × 10
##   Date       Price  Open  High   Low  Vol. Change.. day   month year 
##   <date>     <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <chr> <chr> <chr>
## 1 1979-12-25  839.  839.  842.  833.    NA     0    25    12    1979 
## 2 1979-12-26  838.  839.  843.  834.    NA    -0.12 26    12    1979 
## 3 1979-12-27  840.  838.  843.  834.    NA     0.23 27    12    1979 
## 4 1979-12-28  839.  840.  843.  835.    NA    -0.14 28    12    1979 
## 5 1979-12-29   NA    NA    NA    NA     NA    NA    29    12    1979 
## 6 1979-12-30   NA    NA    NA    NA     NA    NA    30    12    1979
summary(NYstock2)
##       Date                Price               Open          
##  Min.   :1979-12-25   Min.   :  759.13   Min.   :  759.980  
##  1st Qu.:1990-09-15   1st Qu.: 2733.83   1st Qu.: 2733.387  
##  Median :2001-06-06   Median : 9447.96   Median : 9438.680  
##  Mean   :2001-06-06   Mean   :10214.49   Mean   :10212.138  
##  3rd Qu.:2012-02-26   3rd Qu.:13171.78   3rd Qu.:13170.677  
##  Max.   :2022-11-18   Max.   :36799.65   Max.   :36722.600  
##                       NA's   :4848       NA's   :4848       
##       High                Low                 Vol.         
##  Min.   :  767.410   Min.   :  729.950   Min.   :  1.5900  
##  1st Qu.: 2747.680   1st Qu.: 2715.142   1st Qu.: 40.3575  
##  Median : 9521.945   Median : 9348.660   Median :152.8850  
##  Mean   :10274.204   Mean   :10147.937   Mean   :168.8159  
##  3rd Qu.:13242.507   3rd Qu.:13098.073   3rd Qu.:257.3475  
##  Max.   :36952.650   Max.   :36636.000   Max.   :922.6800  
##  NA's   :4848        NA's   :4848        NA's   :6878      
##     Change..              day               month               year          
##  Min.   :-22.610000   Length:15670       Length:15670       Length:15670      
##  1st Qu.: -0.460000   Class :character   Class :character   Class :character  
##  Median :  0.050000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :  0.040395                                                           
##  3rd Qu.:  0.570000                                                           
##  Max.   : 11.370000                                                           
##  NA's   :4848
options(scientific=T, digits = 3) 

We really only care about the trading volume, so we will remove all other columns and work only with Date + Vol. We will then combine that with the weather data for further analysis.

# Keep only Date and Vol., trim both frames to a shared date window, then
# column-bind volume onto the weather data.
NYstock_final <- NYstock2[,c("Date", "Vol.")]
NYstock_final <- subset(NYstock_final, Date <= "2022-09-26")
weather_stockDates <- subset(NYweath_prcpFact, DATE >= "1979-12-25")

# NOTE(review): cbind() aligns purely by row position -- verify both frames
# cover exactly the same run of dates; a date-keyed merge() would be safer.
stockWeather <- cbind(weather_stockDates, NYstock_final)
colnames(stockWeather)[13] <- c("Volume")

Now will do EDA on the the volume data to build a linear regression model. First will look at normality and look for any correlations.

# Drop non-trading days (NA volume) before checking the distribution.
stockWeather_rmNA <- subset(stockWeather, !is.na(Volume))
stock_hist <- hist(stockWeather_rmNA$Volume)

Histogram shows the data is right skewed. Will use sqrt of the volume to normalize.

# sqrt transform to reduce the right skew in trade volume.
stockWeather_rmNA$Volume_norm <- sqrt(stockWeather_rmNA$Volume)
stockWeather_rmNA <- subset(stockWeather_rmNA, select = -c(Date))
# 1988-1999 subset (labelled "90s" in later objects and plot titles).
stockWeather_90s <- subset(stockWeather_rmNA, year >= 1988 & year <= 1999)

hist(stockWeather_rmNA$Volume_norm)

boxplot(stockWeather_rmNA$Volume_norm)

The distribution of sqrt Volume is considerably more normal. Will now look at correlations with Weather data. The boxplot shows there are no outliers after normalizing the data.

# Correlation/scatter matrix of weather vs raw and sqrt-transformed volume.
pairs.panels(stockWeather_rmNA[c("TMAX", "TOT_PRCP","PRCP_factor",
                                 "Volume","Volume_norm")], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = FALSE,  # suppress density overlays on the histograms
             ellipses = FALSE # suppress correlation ellipses
             )

There are no strong correlations present in the data. Will next look at a scatter plot of the Stock Volume vs TMAX, categorized by days with PRCP.

# Volume vs temperature with rain/no-rain highlighted (all years).
NY_weathstock_scatter <- ggplot(stockWeather_rmNA, aes(x = TMAX, y =Volume_norm)) + 
  geom_point(aes(colour = PRCP_factor)) +
  labs(x = "Temperature", y = "Total Daily DOW Trade Volume", 
       title = "Weather Patterns for DOW Trade Volume")
NY_weathstock_scatter

## Trying again with the 90's stock data.

NY_90s_weathstock_scatter <- ggplot(stockWeather_90s, aes(x = TMAX, y =Volume_norm)) + 
  geom_point(aes(colour = PRCP_factor)) +
  labs(x = "Temperature (ºF)", y = "Normalized Daily DOW Trade Volume (M)", 
       title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter

# 30% sample to reduce overplotting; month mapped to color, rain to shape.
NY_90s_weathstock_scatter2 <- stockWeather_90s %>% 
  sample_frac(0.3) %>%
  ggplot(aes(x = TMAX, y =Volume_norm)) + 
  geom_point(aes(colour = month, shape = PRCP_factor)) +
  labs(x = "Temperature (ºF)", y = "Normalized Daily DOW Trade Volume (M)", 
       title = "Weather Patterns for DOW Trade Volume in the 1990s")
NY_90s_weathstock_scatter2

# Linear model: sqrt(volume) on temperature, rain indicator, year, and month.
stock_LM <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
               stockWeather_rmNA)
summary(stock_LM)
## 
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month, 
##     data = stockWeather_rmNA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.414 -2.203 -0.663  2.724 14.811 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -7.92e+02   7.64e+00 -103.76  < 2e-16 ***
## TMAX         -8.70e-03   4.31e-03   -2.02  0.04389 *  
## PRCP_factor1 -2.84e-02   8.04e-02   -0.35  0.72375    
## year          4.02e-01   3.81e-03  105.40  < 2e-16 ***
## month02      -2.86e-01   1.92e-01   -1.49  0.13696    
## month03       1.53e-01   1.91e-01    0.80  0.42478    
## month04      -2.01e-01   2.12e-01   -0.95  0.34321    
## month05      -3.62e-01   2.31e-01   -1.57  0.11699    
## month06      -2.62e-01   2.54e-01   -1.03  0.30286    
## month07      -5.68e-01   2.71e-01   -2.10  0.03600 *  
## month08      -9.47e-01   2.65e-01   -3.58  0.00035 ***
## month09      -2.91e-01   2.46e-01   -1.18  0.23692    
## month10      -1.11e-01   2.16e-01   -0.52  0.60609    
## month11      -4.67e-01   2.01e-01   -2.33  0.02003 *  
## month12      -6.34e-01   1.90e-01   -3.34  0.00084 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.57 on 8738 degrees of freedom
## Multiple R-squared:  0.562,  Adjusted R-squared:  0.561 
## F-statistic:  801 on 14 and 8738 DF,  p-value: <2e-16
# Same model specification restricted to the 1988-1999 subset.
stock_LM_90s <- lm(Volume_norm ~ TMAX + PRCP_factor + year + month,
                   stockWeather_90s)
summary(stock_LM_90s)
## 
## Call:
## lm(formula = Volume_norm ~ TMAX + PRCP_factor + year + month, 
##     data = stockWeather_90s)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.575 -0.746 -0.142  0.596  8.329 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -8.62e+02   1.19e+01  -72.61  < 2e-16 ***
## TMAX          5.16e-03   2.31e-03    2.24  0.02541 *  
## PRCP_factor1 -3.52e-02   4.37e-02   -0.81  0.42087    
## year          4.35e-01   5.95e-03   73.10  < 2e-16 ***
## month02      -1.51e-01   1.03e-01   -1.46  0.14481    
## month03      -2.52e-01   1.02e-01   -2.48  0.01322 *  
## month04      -1.40e-01   1.12e-01   -1.24  0.21419    
## month05      -4.00e-01   1.22e-01   -3.29  0.00102 ** 
## month06      -4.46e-01   1.35e-01   -3.31  0.00096 ***
## month07      -4.75e-01   1.44e-01   -3.31  0.00095 ***
## month08      -4.18e-01   1.39e-01   -3.00  0.00270 ** 
## month09      -2.52e-01   1.29e-01   -1.96  0.05027 .  
## month10       8.14e-02   1.13e-01    0.72  0.47338    
## month11       4.88e-02   1.06e-01    0.46  0.64464    
## month12      -2.97e-02   1.01e-01   -0.30  0.76783    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.13 on 3018 degrees of freedom
## Multiple R-squared:  0.641,  Adjusted R-squared:  0.64 
## F-statistic:  385 on 14 and 3018 DF,  p-value: <2e-16

The linear model that incorporates all of the stock data (from late 1979 to the present) had a statistically significant TMAX, year, and some of the months.

The stock data subset to the 90s generated a similar model.

Find the confidence intervals of the coefficients.

COVID Data

Looking at the effect of precipitation and temperature on the number of positive COVID cases. Using the “CASE_COUNT” parameter for the NYC Covid dataset. CASE_COUNT represents the count of patients tested who were confirmed to be COVID-19 cases on date_of_interest

# Display options: three significant digits
options(scientific = TRUE, digits = 3)

# Load the NYC daily COVID counts; keep only the date of interest and the
# confirmed / probable case-count columns (the first three columns)
NYcovid <- tibble(read.csv("./data/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv"))

NYcovid <- select(NYcovid, date_of_interest, CASE_COUNT, PROBABLE_CASE_COUNT)

head(NYcovid)
## # A tibble: 6 × 3
##   date_of_interest CASE_COUNT PROBABLE_CASE_COUNT
##   <chr>                 <int>               <int>
## 1 02/29/2020                1                   0
## 2 03/01/2020                0                   0
## 3 03/02/2020                0                   0
## 4 03/03/2020                1                   0
## 5 03/04/2020                5                   0
## 6 03/05/2020                3                   0
# Standardize the date column name to match the weather data frame
names(NYcovid)[1] <- "DATE"

# Parse the m/d/Y strings into Date objects, then derive the character
# day / month / year columns with the same format codes used for the
# weather data
NYcovid$DATE <- as.Date(NYcovid$DATE, format = "%m/%d/%Y")
date_parts <- c(day = "%d", month = "%m", year = "%Y")
for (nm in names(date_parts)) {
  NYcovid[[nm]] <- format(NYcovid$DATE, format = date_parts[[nm]])
}

head(NYcovid)
## # A tibble: 6 × 6
##   DATE       CASE_COUNT PROBABLE_CASE_COUNT day   month year 
##   <date>          <int>               <int> <chr> <chr> <chr>
## 1 2020-02-29          1                   0 29    02    2020 
## 2 2020-03-01          0                   0 01    03    2020 
## 3 2020-03-02          0                   0 02    03    2020 
## 4 2020-03-03          1                   0 03    03    2020 
## 5 2020-03-04          5                   0 04    03    2020 
## 6 2020-03-05          3                   0 05    03    2020
# Overview of the merged columns: 991 daily rows, 2020-02-29 to 2022-11-15
summary(NYcovid)
##       DATE              CASE_COUNT    PROBABLE_CASE_COUNT     day           
##  Min.   :2020-02-29   Min.   :    0   Min.   :   0        Length:991        
##  1st Qu.:2020-11-02   1st Qu.:  574   1st Qu.:  76        Class :character  
##  Median :2021-07-08   Median : 1437   Median : 343        Mode  :character  
##  Mean   :2021-07-08   Mean   : 2534   Mean   : 478                          
##  3rd Qu.:2022-03-12   3rd Qu.: 2796   3rd Qu.: 644                          
##  Max.   :2022-11-15   Max.   :54947   Max.   :5882                          
##     month               year          
##  Length:991         Length:991        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

Next, we looked at the normality of the COVID count data. The counts were extremely skewed to the right. We first removed multiple rounds of outliers using the outlierKD2 function. After removing all outliers, the data were still skewed right, but less extremely so. We then used a square-root transform to normalize the data.

# Raw case counts over time, and their (heavily right-skewed) distribution
covid_plot <- plot(NYcovid$DATE, NYcovid$CASE_COUNT)

covid_boxplot <- boxplot(NYcovid$CASE_COUNT)

# First outlier pass: outlierKD2 (from ezids) flags boxplot outliers in
# CASE_COUNT; with rm=TRUE it appears to replace them with NA while
# keeping the rows (row counts are unchanged downstream) — confirm
# against the ezids documentation.
NYcovid_rmOuts <- outlierKD2(NYcovid, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 42 
## Proportion (%) of outliers: 4.4 
## Mean of the outliers: 21841 
## Mean without removing outliers: 2534 
## Mean if we remove outliers: 1680 
## Outliers successfully removed
# Second outlier pass on the already-trimmed counts
NYcovid_rmOuts2 <- outlierKD2(NYcovid_rmOuts, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 15 
## Proportion (%) of outliers: 1.6 
## Mean of the outliers: 5696 
## Mean without removing outliers: 1680 
## Mean if we remove outliers: 1615 
## Outliers successfully removed
# Third outlier pass
NYcovid_rmOuts3 <- outlierKD2(NYcovid_rmOuts2, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 7 
## Proportion (%) of outliers: 0.8 
## Mean of the outliers: 5200 
## Mean without removing outliers: 1615 
## Mean if we remove outliers: 1588 
## Outliers successfully removed
# Fourth and final outlier pass (only 2 outliers remain, 0.2%)
NYcovid_rmOuts4 <- outlierKD2(NYcovid_rmOuts3, CASE_COUNT, rm=TRUE, boxplt=TRUE, histogram=TRUE, qqplt=TRUE)

## Outliers identified: 2 
## Proportion (%) of outliers: 0.2 
## Mean of the outliers: 5094 
## Mean without removing outliers: 1588 
## Mean if we remove outliers: 1581 
## Outliers successfully removed
# Plot and inspect the counts after the final (fourth) outlier pass.
# BUG FIX: the original tail() inspected NYcovid_rmOuts3 (third pass)
# while the plot used rmOuts4; both now refer to the final data frame.
covid_plot <- plot(NYcovid_rmOuts4$DATE, NYcovid_rmOuts4$CASE_COUNT)
tail(NYcovid_rmOuts4)
## # A tibble: 6 × 6
##   DATE       CASE_COUNT PROBABLE_CASE_COUNT day   month year 
##   <date>          <int>               <int> <chr> <chr> <chr>
## 1 2022-11-10       1957                 621 10    11    2022 
## 2 2022-11-11       1580                 395 11    11    2022 
## 3 2022-11-12       1477                 539 12    11    2022 
## 4 2022-11-13       1332                 514 13    11    2022 
## 5 2022-11-14       2861                 800 14    11    2022 
## 6 2022-11-15       2120                 652 15    11    2022
# Square-root transform to reduce the remaining right skew.
# BUG FIX: compute the transform from the SAME data frame (rmOuts4) that
# is cbind-ed below. The original used rmOuts3, so rows whose CASE_COUNT
# was set to NA in the final outlier pass still carried a sqrt value,
# making CASE_COUNT and sqrt_count inconsistent within a row.
sqrt_count <- sqrt(NYcovid_rmOuts4$CASE_COUNT)
#hist(sqrt_count)

NYcovid_final <- cbind(NYcovid_rmOuts4, sqrt_count)
head(NYcovid_final)
##         DATE CASE_COUNT PROBABLE_CASE_COUNT day month year sqrt_count
## 1 2020-02-29          1                   0  29    02 2020       1.00
## 2 2020-03-01          0                   0  01    03 2020       0.00
## 3 2020-03-02          0                   0  02    03 2020       0.00
## 4 2020-03-03          1                   0  03    03 2020       1.00
## 5 2020-03-04          5                   0  04    03 2020       2.24
## 6 2020-03-05          3                   0  05    03 2020       1.73

Add the COVID data to the NY weather data, subsetting the weather data to on or after 2/29/2020 (the first date in the COVID dataset) and the COVID data to on or before 9/26/2022 (the last date in the weather dataset).

# Weather from the first COVID date (2020-02-29) onward; COVID data up to
# the last available weather date (2022-09-26). Explicit as.Date() makes
# the Date comparison intentional instead of relying on implicit
# character-to-Date coercion.
covWeather <- subset(NYweath_prcpFact, DATE >= as.Date("2020-02-29"))
NYcovid_finaldates <- subset(NYcovid_final, DATE <= as.Date("2022-09-26"))
tail(covWeather)
##             DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 56088 2022-09-21  21    09 2022   80   62   NA 0.00    0     0.00           0
## 56089 2022-09-22  22    09 2022   75   57   NA 0.47    0     0.47           1
## 56090 2022-09-23  23    09 2022   63   51   NA 0.00    0     0.00           0
## 56091 2022-09-24  24    09 2022   69   49   NA 0.00    0     0.00           0
## 56092 2022-09-25  25    09 2022   72   59   NA 1.11    0     1.11           1
## 56093 2022-09-26  26    09 2022   74   58   NA 0.00    0     0.00           0
# cbind() pairs rows purely by position; fail fast if the weather and
# COVID frames do not have the same number of rows (one per day), since
# a mismatch would silently misalign dates or error on recycling.
stopifnot(nrow(covWeather) == nrow(NYcovid_finaldates))

NYweath_prcpCov <- cbind(covWeather, NYcovid_finaldates$CASE_COUNT,
                         NYcovid_finaldates$sqrt_count)
colnames(NYweath_prcpCov)[12:13] <- c("CASE_COUNT", "sqrt_count")

# Quick look: sqrt(case count) vs sqrt(precipitation)
covCount_prcp_plot <- plot(NYweath_prcpCov$sqrt_count, sqrt(NYweath_prcpCov$PRCP))

# Keep (by position) DATE, day, month, year, TMAX, PRCP, TOT_PRCP,
# PRCP_factor, CASE_COUNT, sqrt_count
NYweath_cov_final <- NYweath_prcpCov[,c(1:5, 8, 10:13)]

Plot of COVID case count vs. precipitation: no apparent relationship. However, we are more interested in the effect of precipitation on case counts than in the correlation between the two variables.

T-test comparing Covid positive counts on days with precipitation vs days without prcp.

# Split the merged data by the precipitation indicator (1 = any precip)
cov_prcp1 <- subset(NYweath_cov_final, PRCP_factor == 1)
cov_prcp0 <- subset(NYweath_cov_final, PRCP_factor == 0)



# Welch two-sample t-test of sqrt(case count): dry days vs. wet days
cov_count_ttest <- t.test(cov_prcp0$sqrt_count, cov_prcp1$sqrt_count)
cov_count_ttest
## 
##  Welch Two Sample t-test
## 
## data:  cov_prcp0$sqrt_count and cov_prcp1$sqrt_count
## t = 0.2, df = 650, p-value = 0.8
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.02  2.49
## sample estimates:
## mean of x mean of y 
##      36.3      36.1
# Boxplot of sqrt(case count) by precipitation indicator
cov_count_bplot <- ggplot(NYweath_cov_final,
                          aes(x = PRCP_factor, y = sqrt_count)) +
  geom_boxplot() +
  labs(title = "COVID Positive Counts")

cov_count_bplot

# Repeat the EDA using only COVID cases from 2021 onward.
cov_2021 <- subset(NYweath_cov_final, year >= 2021)

# Boxplot of sqrt(case count) by precipitation indicator, 2021+
cov_2021count_bplot <- ggplot(cov_2021,
                              aes(x = PRCP_factor, y = sqrt_count)) +
  geom_boxplot() +
  labs(title = "2021 COVID Positive Counts")
cov_2021count_bplot

# Split by precipitation indicator and rerun the Welch t-test
cov_2021prcp1 <- subset(cov_2021, PRCP_factor == 1)
cov_2021prcp0 <- subset(cov_2021, PRCP_factor == 0)

cov_2021count_ttest <- t.test(cov_2021prcp0$sqrt_count, cov_2021prcp1$sqrt_count)
cov_2021count_ttest
## 
##  Welch Two Sample t-test
## 
## data:  cov_2021prcp0$sqrt_count and cov_2021prcp1$sqrt_count
## t = 1, df = 414, p-value = 0.2
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.06  4.10
## sample estimates:
## mean of x mean of y 
##      40.2      38.7

No significant difference in the mean Covid case counts on days with precipitation or without. However, there was a greater difference in the means when only incorporating Covid from 2021+.

# Scatter of sqrt(case count) vs. max temperature, colored by month and
# shaped by the precipitation indicator.
# BUG FIX: the y-axis label was copy-pasted from the stock-volume section
# ("DOW Trade Volume"); it now describes the COVID counts actually plotted.
covWeath_final_scatter <- ggplot(NYweath_cov_final, 
                                 aes(x = TMAX, 
                                     y =sqrt_count,
                                     )) + 
  geom_point(aes(colour = month, shape = PRCP_factor)) +
  labs(x = "Temperature", 
       y = "Square Root of Daily COVID Case Count", 
       title = "Weather Patterns for Covid Case Counts")
covWeath_final_scatter

Will now build a linear model that incorporates temperature, precipitation, and Month to predict Covid counts.

library(psych)  # already attached at the top of the file; harmless repeat


# Scatterplot-matrix EDA over columns 4:10 of the merged COVID/weather data
pairs.panels(NYweath_cov_final[4:10], 
             method = "pearson", # correlation method
             hist.col = "#00AFBB",
             density = FALSE,  # suppress density overlays on the histograms
             ellipses = FALSE # suppress correlation ellipses
             )

# Baseline linear model: sqrt(COVID cases) ~ max temp + precip indicator + year
cov_weathLM <- lm(sqrt_count ~ TMAX + PRCP_factor + year,
                  data = NYweath_cov_final)
summary(cov_weathLM)
## 
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year, data = NYweath_cov_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.45  -9.44  -1.96  10.18  42.67 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.51e+04   1.25e+03  -12.09   <2e-16 ***
## TMAX         -3.11e-01   2.87e-02  -10.82   <2e-16 ***
## PRCP_factor1 -4.84e-01   1.01e+00   -0.48     0.63    
## year          7.49e+00   6.17e-01   12.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.3 on 873 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.23,   Adjusted R-squared:  0.227 
## F-statistic: 86.8 on 3 and 873 DF,  p-value: <2e-16
# Add month (a factor) to capture seasonality; this largely absorbs the
# TMAX effect seen in the baseline model (see summary below)
cov_weathLM_month <- lm(sqrt_count ~ TMAX + PRCP_factor + year + month,
                  data = NYweath_cov_final)
summary(cov_weathLM_month)
## 
## Call:
## lm(formula = sqrt_count ~ TMAX + PRCP_factor + year + month, 
##     data = NYweath_cov_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41.67  -8.91  -0.68   8.14  41.26 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.56e+04   1.22e+03  -12.74  < 2e-16 ***
## TMAX         -5.53e-02   5.57e-02   -0.99   0.3210    
## PRCP_factor1 -6.30e-01   9.33e-01   -0.68   0.4994    
## year          7.74e+00   6.05e-01   12.79  < 2e-16 ***
## month02      -1.87e+01   3.03e+00   -6.16  1.1e-09 ***
## month03      -1.70e+01   2.99e+00   -5.67  1.9e-08 ***
## month04      -9.47e+00   3.15e+00   -3.01   0.0027 ** 
## month05      -1.91e+01   3.41e+00   -5.62  2.6e-08 ***
## month06      -2.63e+01   3.76e+00   -7.00  5.0e-12 ***
## month07      -2.15e+01   3.94e+00   -5.46  6.2e-08 ***
## month08      -2.07e+01   3.89e+00   -5.32  1.3e-07 ***
## month09      -2.32e+01   3.62e+00   -6.39  2.6e-10 ***
## month10      -2.54e+01   3.43e+00   -7.40  3.3e-13 ***
## month11      -1.66e+01   3.22e+00   -5.15  3.2e-07 ***
## month12       2.09e+00   3.31e+00    0.63   0.5289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.2 on 862 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.358,  Adjusted R-squared:  0.347 
## F-statistic: 34.3 on 14 and 862 DF,  p-value: <2e-16
# 2021+ subset modeled on the RAW case counts (CASE_COUNT), not the sqrt
# transform used in the two models above — presumably intentional, but
# TODO confirm this asymmetry was deliberate
cov2021_weathLM <- lm(CASE_COUNT ~ TMAX + PRCP_factor + year + month,
                  data = cov_2021)
summary(cov2021_weathLM)
## 
## Call:
## lm(formula = CASE_COUNT ~ TMAX + PRCP_factor + year + month, 
##     data = cov_2021)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -3423   -829    -67    548   3129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.14e+06   1.92e+05   -5.94  5.0e-09 ***
## TMAX          2.11e-01   5.45e+00    0.04   0.9692    
## PRCP_factor1 -7.93e+01   9.24e+01   -0.86   0.3908    
## year          5.66e+02   9.50e+01    5.96  4.5e-09 ***
## month02      -1.73e+03   2.47e+02   -7.01  7.0e-12 ***
## month03      -1.81e+03   2.57e+02   -7.03  5.9e-12 ***
## month04      -1.82e+03   2.78e+02   -6.53  1.5e-10 ***
## month05      -1.84e+03   3.07e+02   -6.00  3.6e-09 ***
## month06      -2.28e+03   3.40e+02   -6.70  5.1e-11 ***
## month07      -1.81e+03   3.57e+02   -5.05  5.9e-07 ***
## month08      -1.89e+03   3.58e+02   -5.29  1.8e-07 ***
## month09      -2.28e+03   3.30e+02   -6.92  1.2e-11 ***
## month10      -2.61e+03   3.25e+02   -8.03  5.7e-15 ***
## month11      -2.36e+03   2.92e+02   -8.07  4.2e-15 ***
## month12      -1.02e+03   3.74e+02   -2.73   0.0064 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1060 on 565 degrees of freedom
##   (54 observations deleted due to missingness)
## Multiple R-squared:  0.244,  Adjusted R-squared:  0.225 
## F-statistic:   13 on 14 and 565 DF,  p-value: <2e-16

The linear model that only incorporates TMAX, PRCP_factor, and year has statistically significant coefficients for TMAX and year. This indicates that the model predicts the square root of COVID counts decreases by about 0.3 for each degree (F) increase in TMAX.

However, when we account for month, we lose the significance of the TMAX variable. This indicates that COVID cases are more affected by seasonal changes than by temperature itself.

Let’s try to graph this! (The attempts below did not work and are retained as commented-out code.)

#covLM_plot <- covWeath_final_scatter + 
 # geom_smooth(method = lm, se = FALSE, fullrange = TRUE,
              #aes(colour = PRCP_factor))
#covLM_plot
  
#ggplt
  
# Plotting multiple Regression Lines
#ggplt+geom_smooth(method=lm,se=FALSE,fullrange=TRUE,
   #               aes(color=Tree))

#covLM_plot

LR to predict Precipitation!

Let’s start by ingesting the Air Quality to add into this prediction in place of COVID.

#DailyAQ_00_22 <- data.frame(read.csv("data/daily-AQ-NY-00-20.csv"))
#DailyAQ_00_22 <- DailyAQ_00_22[c('Date', 'Daily.Mean.PM2.5.Concentration', #'DAILY_AQI_VALUE')]
#colnames(DailyAQ_00_22) <- c('DATE', 'PM2.5', 'AQI')
#str(DailyAQ_00_22)
#xkablesummary(DailyAQ_00_22)
#xkabledplyhead(DailyAQ_00_22)

Now let’s build a master dataframe that incorporates Date, Year, Month, TMAX, PRCP, PRCP_Factor, Crime Count, DOW Volume, PM2.5, and AQI.

# FORMAT AQ Data and subset dates
#AQ_forLogit <- DailyAQ_00_22
#AQ_forLogit$DATE <- as.Date(AQ_forLogit$DATE, format = "%m/%d/%y")
#AQ_forLogit$day <- format(AQ_forLogit$DATE, format="%d")
#AQ_forLogit$month <- format(AQ_forLogit$DATE, format="%m")
#AQ_forLogit$year <- format(AQ_forLogit$DATE, format="%Y")
#AQ_forLogit$year <- as.numeric(AQ_forLogit$year)

#AQ_forLogit2 <- AQ_forLogit %>% 
 # complete(DATE = seq.Date(min(DATE), max(DATE), by="day"))

#AQ_masterDates <- subset(AQ_forLogit2, year >= 2006 & year < 2022)


# Align stock, crime, and weather data over 2006-01-01 through 2021-12-31
stock_masterDates <- subset(NYstock_final,Date >= "2006-01-01" &
                              Date <= "2021-12-31")

crime_masterDates <- NYcrime_count

weath_masterDates <- subset(NYweath_prcpFact, year >= 2006 & year < 2022) 

# cbind() pairs rows purely by position, so all three frames must carry
# exactly one row per calendar day over the same span — fail fast if not,
# rather than silently misaligning dates.
stopifnot(nrow(weath_masterDates) == nrow(crime_masterDates),
          nrow(weath_masterDates) == nrow(stock_masterDates))

# NOTE(review): "Vol." is the raw column name from the stock CSV; the
# head() below shows NA volume on non-trading days, as expected.
master_log <- cbind(weath_masterDates,
                    crime_masterDates$NUM_ARREST,
                    stock_masterDates$Vol.)
colnames(master_log)[12:13] <- c('NUM_ARREST', 'Volume')

head(master_log)
##             DATE day month year TMAX TMIN TAVG PRCP SNOW TOT_PRCP PRCP_factor
## 49981 2006-01-01  01    01 2006   42   32   NA 0.00    0     0.00           0
## 49982 2006-01-02  02    01 2006   48   39   NA 0.63    0     0.63           1
## 49983 2006-01-03  03    01 2006   40   33   NA 1.13    0     1.13           1
## 49984 2006-01-04  04    01 2006   38   29   NA 0.00    0     0.00           0
## 49985 2006-01-05  05    01 2006   50   37   NA 0.05    0     0.05           1
## 49986 2006-01-06  06    01 2006   43   30   NA 0.00    0     0.00           0
##       NUM_ARREST Volume
## 49981        551     NA
## 49982        618     NA
## 49983        899    303
## 49984       1229    271
## 49985       1383    251
## 49986       1248    292
# Drop non-trading days (NA volume), then sqrt-transform volume to reduce skew
master_logFinal <- subset(master_log, !is.na(Volume))
master_logFinal$Volume_norm <- sqrt(master_logFinal$Volume)

Now let’s build the LR:

# Logistic regression: probability of any precipitation from temperature,
# arrest counts, normalized trading volume, year, and month
prcp_logit <- glm(PRCP_factor ~ TMAX + NUM_ARREST +
                    Volume_norm + year + month,
                  data = master_logFinal,
                  family = binomial(link = "logit"))

summary(prcp_logit)
## 
## Call:
## glm(formula = PRCP_factor ~ TMAX + NUM_ARREST + Volume_norm + 
##     year + month, family = binomial(link = "logit"), data = master_logFinal)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.295  -0.957  -0.844   1.350   1.707  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 72.931220  20.475973    3.56  0.00037 ***
## TMAX        -0.005733   0.003813   -1.50  0.13272    
## NUM_ARREST  -0.000950   0.000142   -6.67  2.6e-11 ***
## Volume_norm -0.010261   0.008914   -1.15  0.24972    
## year        -0.035885   0.010121   -3.55  0.00039 ***
## month02      0.242612   0.168088    1.44  0.14892    
## month03      0.149488   0.169288    0.88  0.37722    
## month04      0.400139   0.185421    2.16  0.03093 *  
## month05      0.326598   0.205157    1.59  0.11140    
## month06      0.503649   0.222956    2.26  0.02389 *  
## month07      0.344674   0.241819    1.43  0.15406    
## month08      0.201185   0.236342    0.85  0.39463    
## month09     -0.049412   0.222747   -0.22  0.82445    
## month10      0.074118   0.193259    0.38  0.70134    
## month11     -0.012329   0.178675   -0.07  0.94499    
## month12      0.046996   0.169456    0.28  0.78152    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5256.9  on 4027  degrees of freedom
## Residual deviance: 5185.4  on 4012  degrees of freedom
## AIC: 5217
## 
## Number of Fisher Scoring iterations: 4

Let’s assess the LR!

library(ModelMetrics)
# NOTE(review): caret (attached in the file header) also exports
# confusionMatrix; loading ModelMetrics here masks it. The actual=/
# predicted= argument names below match the ModelMetrics signature,
# which applies a default probability cutoff of 0.5.
prcpLR_cm <- confusionMatrix(actual = prcp_logit$y, 
                  predicted = prcp_logit$fitted.values)
prcpLR_cm
##      [,1] [,2]
## [1,] 2541 1399
## [2,]   43   45
# Manual metrics from the 2x2 matrix. NOTE(review): the row/column
# orientation (actual vs. predicted) is not verified here — confirm
# against the ModelMetrics documentation before trusting precision /
# recall / specificity. These values are computed but never printed.
prcpLR_acc <- (prcpLR_cm[2,2] + prcpLR_cm[1,1])/(sum(prcpLR_cm))
prcpLR_prec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[1,2])
prcpLR_rec <- (prcpLR_cm[2,2])/(prcpLR_cm[2,2]+prcpLR_cm[2,1])
prcpLR_spec <- (prcpLR_cm[1,1])/(prcpLR_cm[1,1]+prcpLR_cm[1,2])

library(pROC)

# ROC curve and AUC for the precipitation logit; AUC ~0.575 indicates
# very weak discrimination (0.5 = chance)
master_logFinal$prob=predict(prcp_logit, type = c("response"))
prcp_roc <- roc(PRCP_factor ~ prob, data = master_logFinal)
prcp_auc <- auc(prcp_roc)
prcp_auc
## Area under the curve: 0.575
plot(prcp_roc)

library(pscl)
# McFadden pseudo-R^2 and related fit statistics for the logit
prcp_pr2 <- pR2(prcp_logit)
## fitting null model for pseudo-r2
prcp_pr2
##       llh   llhNull        G2  McFadden      r2ML      r2CU 
## -2.59e+03 -2.63e+03  7.15e+01  1.36e-02  1.76e-02  2.41e-02

This is NOT a good logistic regression!!!

Summary of Key Findings

We have statistically significant correlations on models using weather, air quality, and human activity data from NYC, but none of our models demonstrate high predictive potential.

Challenges with data: the data were not ideal for our initial hypotheses, so we can neither reject nor accept them. We would need better data.

Cannot model seasonality with linear regressions. Potentially need to use seasonality-adjusted time-series models for better outcomes.

What data would enable us to answer the question if we had more time?